Open In Colab

1. Two-dimensional data¶

In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd
from typing import List
from numpy.linalg import norm
from IPython.core.formatters import Dict
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
import numpy as np
from matplotlib.patches import Ellipse
import pickle
In [ ]:
# Generate the 2D dataset using sklearn
# Create the moon datasets of varying sizes
n_samples_small = 100
n_samples_medium = 500
n_samples_large = 1000

#let's plot the dataset["medium"]
# X, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
Xsmall,_=make_moons(n_samples=n_samples_small, noise=0.05, random_state=0)
Xmid,_=make_moons(n_samples=n_samples_medium, noise=0.05, random_state=0)
Xlarge,_=make_moons(n_samples=n_samples_large, noise=0.05, random_state=0)

datasets=datasets = {
    "Small": Xsmall,
    "Medium": Xmid,
    "Large": Xlarge,
}
# Plot any of the dataset
plt.figure(figsize=(8, 6))
plt.scatter(datasets["Medium"][:,0], datasets["Medium"][:,1], c='blue', label='make_moons dataset', alpha=0.5)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7c741e2df6a0>

1.1 Histogram¶

In [ ]:
def train_histogram(X, bins=20):
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()

    x_bins = np.linspace(x_min, x_max, bins + 1)
    y_bins = np.linspace(y_min, y_max, bins + 1)

    histogram = np.zeros((bins, bins), dtype=int)

    for i in range(bins):
        for j in range(bins):
            x_condition = np.logical_and(x_bins[i] <= X[:, 0], X[:, 0] < x_bins[i + 1])
            y_condition = np.logical_and(y_bins[j] <= X[:, 1], X[:, 1] < y_bins[j + 1])
            bin_count = np.sum(np.logical_and(x_condition, y_condition))
            histogram[i, j] = bin_count

    return histogram, x_bins, y_bins

# Train histogram for a specific dataset (e.g., the small one)
X_small = datasets["Large"]
histogram_small, x_bins, y_bins = train_histogram(X_small)

# You can visualize the histogram if needed
plt.imshow(histogram_small, origin='lower', cmap='Blues')
plt.colorbar()
plt.title('2D Histogram')
plt.show()
In [ ]:
def generate_samples_from_histogram(histogram, x_bins, y_bins, num_samples):
    # Normalize the histogram to obtain a probability distribution
    histogram_normalized = histogram / np.sum(histogram)

    # Get the shape of the histogram
    num_x_bins, num_y_bins = histogram.shape

    # Generate random indices based on the normalized histogram
    random_indices = np.unravel_index(
        np.random.choice(histogram.size, size=num_samples, p=histogram_normalized.ravel()),
        (num_x_bins, num_y_bins)
    )

    # Map the random indices to bin centers
    x_min, x_max = x_bins[0], x_bins[-1]
    y_min, y_max = y_bins[0], y_bins[-1]
    x_bin_centers = (x_bins[:-1] + x_bins[1:]) / 2
    y_bin_centers = (y_bins[:-1] + y_bins[1:]) / 2
    x_samples = x_bin_centers[random_indices[0]]
    y_samples = y_bin_centers[random_indices[1]]

    return np.column_stack((x_samples, y_samples))

# Example of generating data samples from the small histogram
num_samples_to_generate = 1000
generated_samples = generate_samples_from_histogram(histogram_small, x_bins, y_bins, num_samples_to_generate)

plt.scatter(generated_samples[:, 0], generated_samples[:, 1], c='red', marker='x', label='Generated Samples')

# You can also overlay the original data points for comparison
plt.scatter(X_small[:, 0], X_small[:, 1], c='blue', marker='o', label='Original Data')

# Set labels and legend
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.legend()

# Show the plot
plt.show()
Comment
  1. Conceptual difference in implementation:

    1.1. The sample solution have implemented a grid-based approach. It defines bin edges along both dimensions (x and y) and counts data points falling into each bin. The histogram is constructed based on the occurrences within each bin. 1.2. In our solution, we have implemented a range-based histogram. It counts occurrences by checking each data point against bin edges.It uses conditions to determine the appropriate bin for each data point.

    The grid based appraoch provides a more regular and structured representation and is suitable for capturing overall trends, whereas the range based approach allows to capture the specific characteristics of the data distribution more thoroughly.

  2. In the figure above, we have considered the a dataset with 1000 datapoints (We would like to point out that there is some mistake in naming the variable above. We named our dataset X_small even though we have considered the case of large dataset (number of datapoints=1000). We apologise for this mistake). Our results well with the sample solution's plot.

1.2 Single Gaussian Distribution model¶

In [ ]:
### define n- dimensional gaussian distribution
def gaussian_multivar(x:np.array,mean:np.array,cov_mat:np.array):
  '''
  This model will evaluate the probabilty at the given x_vector using the GMM model.
  Args:
  1. x= np array of shape (1 X dimn_of_the_problem)
  2. mean= np array of shape (1 X dimn_of_the_problem)
  3.cov_mat= square np array of shape (dimn_of_the_prob)
  '''
  dimn=np.shape(x)[0]
  inv_cov=np.linalg.inv(cov_mat)
  x_minus_mu=x-mean
  # argument for the exp
  arg_for_expn=0.5*(x_minus_mu @ inv_cov @ np.transpose(x_minus_mu))[0,0]
  det_cov=np.linalg.det(cov_mat)
  # evaluating the prob
  prob=np.exp(-arg_for_expn) * (1./(np.sqrt(det_cov*((2*np.pi)**dimn))))

  return prob
In [ ]:
import numpy as np
from scipy.linalg import cholesky
from scipy.special import erfinv

def sample_multivariate_gaussian(mean, covariance, num_samples=1):
    """
    Sample from an n-dimensional Gaussian distribution.

    Args:
        mean: Mean vector (1D array) of length n. (1,)
        covariance: Covariance matrix (2D array) of shape (n, n).
        num_samples: Number of samples to generate (default is 1).

    Returns:
        samples: Array of shape (num_samples, n) containing generated samples.
    """
    n = len(mean)

    # Generate random vectors 'u' from a uniform distribution
    u = np.random.rand(num_samples, n)

    # Use the inverse error function (probit function) to transform 'u' into standard normal samples
    z_standard_normal = np.sqrt(2) * erfinv(2 * u - 1)

    # Apply the Cholesky decomposition to the covariance matrix
    L = cholesky(covariance, lower=True)

    # Transform standard normal samples into multivariate Gaussian samples
    samples = np.dot(z_standard_normal, L.T) + mean

    return samples

# Example usage:
# mean_vector = np.array([0.0, 0.0])
# covariance_matrix = np.array([[1.0, 0.5], [0.5, 1.0]])
# generated_samples = sample_multivariate_gaussian(mean_vector, covariance_matrix, num_samples=100)

Fitting the single (2-D) gaussina distribution to our data.¶

in our case, the best fitting parameter would be the mean vector and the covariance matrix of the given dataset

In [ ]:
def fit_gaussian_to_data(X):
    # Calculate the mean and covariance matrix
    mean = np.mean(X, axis=0)
    cov_matrix = np.cov(X, rowvar=False)

    return (mean, cov_matrix)
In [ ]:
fitted_params={}
fitted_params['Small']=fit_gaussian_to_data(datasets["Small"])
fitted_params['Medium']=fit_gaussian_to_data(datasets["Medium"])
fitted_params['Large']=fit_gaussian_to_data(datasets["Large"])
In [ ]:
sample_small=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
sample_medium=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
sample_high=sample_multivariate_gaussian(fitted_params['Small'][0], fitted_params['Small'][1],num_samples=100)
In [ ]:
# Plot
plt.figure(figsize=(8, 6))
plt.scatter(datasets['Small'][:,0], datasets['Small'][:, 1], c='blue', label='make_moons dataset: small', alpha=0.5)
plt.scatter(sample_small[:,0],sample_small[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 100 datapoints")
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.scatter(datasets['Medium'][:,0], datasets['Medium'][:, 1], c='blue', label='make_moons dataset: medium', alpha=0.5)
plt.scatter(sample_medium[:,0],sample_medium[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 500 datapoints")
plt.legend()
plt.show()

plt.figure(figsize=(8, 6))
plt.scatter(datasets['Large'][:,0], datasets['Large'][:, 1], c='blue', label='make_moons dataset: large', alpha=0.5)
plt.scatter(sample_high[:,0],sample_high[:,1],c='red',label="sampled",alpha=0.5)
plt.title("gaussian fitted to 1000 datapoints")
plt.legend()
plt.show()
Comment

Both the codes (sample and ours) achieve the task of defining a Gaussian distribution class, fitting it to data, and generating samples. However, the most important difference between the two is the way samples are generated:

  • sample code: It uses eigenvalue decomposition to transform samples from a standard normal distribution to match the covariance matrix of the Gaussian distribution. It follows a more complex but mathematically sound method for generating samples that respect the covariance structure.

  • Our code: It uses the Cholesky decomposition method for generating samples from a multivariate Gaussian distribution. Cholesky decomposition is a standard and efficient method for generating correlated multivariate normal samples.

If simplicity and widely accepted methods are a priority, our code might be favored. If a more advanced method with eigenvalue decomposition is preferred, the sample solution is a good choice.

As far as the way code is implemented is concerned, we have followed more of a modular appraoch to implement our code (unlike the sample solution, which is more suitable for use in scikit-learn pipelines.)

1.3 Gaussian mixture model (GMM)¶

In [ ]:
def gaussian_mixture_model(x_input:np.array, mixing_coeffs:np.array, mean_array:np.array, covs:Dict):
  '''
  Args:
  1. x_input: x vector at which one wants to evaluate the GMM model.
              A numpy array (row vector) of shape (1 X dimn of the problem)
  2. mean:  A numpy arrray of shape (L X dimn of the problem instance)
  3. mixing coeffs: a numpy array of shape (1 X L)
  4. covs: dict of covaraince matrices, L number of entries, one for each of the P_l(x).
  '''
  L=np.shape(mean_array)[0]
  dimn_of_the_prob=np.shape(x_input)[1]
  list_gmm=[mixing_coeffs[0,idx_el] * gaussian_multivar(x=x_input, mean=mean_array[idx_el,:],cov_mat=covs[idx_el])
            for idx_el in range(0,L)
            ]
  prob=sum(list_gmm)## sometimes instead of 1./num_data , literature uses the factor 1./(bandwidth X num_data)
  return prob

EM algorithm for training the params for the GMM¶

In [ ]:
def EM_training(mean:np.array,mixing_coeffs:np.array,covs:Dict, dataset_input:np.array,num_iters:int=500,tol:float=0.0001):
  '''
  Args:
  mean= a numpy array
        shape(mean)= (L X dimension_of_the_problem_instance (whether 1D, 2D, 3D and so on))
  mixing_coeffs=
        shape(mixing_coeffs)= (1 X L)
  covs= dict of covariance matrices.
        shape(covs)= L number of entries, one for each of the P_l(x)
  dataset_input: shape would be (num of data set X dimn_of_the_prob)
  '''
  L=np.shape(mean)[0]
  print("L is:");print(L)
  dimn_of_prob=np.shape(dataset_input)[1]
  # define gamma: responsibility
  num_data=np.shape(dataset_input)[0] ### note shape(dataset_input)=(num data X dimn_of_the_problem)
  gamma=np.zeros(shape=(num_data,L))
  prev_log_likelihood = -np.inf  # Initialize the log-likelihood to negative infinity
  ###
  for t in range(0,num_iters):

    ### evaluate the E step
    # update the gamma.
    for idx_over_data in range(0,num_data):
      prod_mixing_coeffs_and_gaussian_l=[mixing_coeffs[0,idx_el]*gaussian_multivar(x=dataset_input[idx_over_data,:].reshape((1,dimn_of_prob)),mean=mean[idx_el,:].reshape((1,dimn_of_prob)),cov_mat=covs[idx_el]) for idx_el in range(0,L)]
      Dr_for_gamma=np.sum(prod_mixing_coeffs_and_gaussian_l)
      gamma[idx_over_data,:]=[prod_mixing_coeffs_and_gaussian_l[k]/Dr_for_gamma for k in range(0,L)]

    #print("gamma obtained:");print(gamma)
    ### M-step: Re-estimating the parameters
    N_l=np.sum(gamma,axis=0)# this is the expression sum_{i=1}^{N(all dataset)} gamma_{il} in the lecture notes
    #print("N_l is:");print(N_l)# NOTE:: shape of N_l = (L,) (note that this array is 1D)

    ### 1. update means: the expresion is like a wtd sum of the vectors
    for idx_el in range(0,L):
      #print("index el is:");print(idx_el)
      temp_vec=np.array(
          [gamma[idx_over_data,idx_el]*dataset_input[idx_over_data,:]
           for idx_over_data in range(0,num_data)]
          )
      mean[idx_el,:]=(1./N_l[idx_el])*np.sum(temp_vec,axis=0)
    #print("mean matrix after update:");print(mean)

    ### 2. update the covariance matrix:
    ##### formula for updating the cov mat is:
    ##### sigma_lth=1/N_l[idx_el] * sum_{i=1}^{N(all_dataset)} * gamma_{il}*some_outer_product
    for idx_el in range(0,L):
      #print("updating cov matrix, index el is:",idx_el)
      temp_outer_prod_list=np.array(
          [gamma[idx_over_data,idx_el]*(dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((dimn_of_prob,1)) @
           (dataset_input[idx_over_data,:]-mean[idx_el,:]).reshape((1,dimn_of_prob))
          for idx_over_data in range(0,num_data)
         ]
          )
      covs[idx_el]=(1./N_l[idx_el])*sum(temp_outer_prod_list)
    #print("updated covriances");print(covs)

    ### 3. update the mixing coeffs:
    for idx_el in range(0,L):
      mixing_coeffs[0,idx_el]=N_l[idx_el]/num_data

    # Calculate the log-likelihood for the current parameters
    current_log_likelihood = 0
    for idx_over_data in range(num_data):
        log_sum = np.log(sum([mixing_coeffs[0, idx_el] * gaussian_multivar(x=dataset_input[idx_over_data, :].reshape((1, dimn_of_prob)), mean=mean[idx_el, :].reshape((1, dimn_of_prob)), cov_mat=covs[idx_el]) for idx_el in range(L)]))
        current_log_likelihood += log_sum

    # Check for convergence
    if np.abs(current_log_likelihood - prev_log_likelihood) < tol:
        break

    prev_log_likelihood = current_log_likelihood

  return mean,covs,mixing_coeffs
In [ ]:
def fit_the_GMM(X, L):

  # Define your Gaussian function for multivariate data
  def gaussian_multivar(x, mean, cov_mat):
    dimn = np.shape(x)[1]
    inv_cov = np.linalg.inv(cov_mat)
    x_minus_mu = x - mean
    arg_for_expn = 0.5 * np.sum(x_minus_mu @ inv_cov * x_minus_mu, axis=1)
    det_cov = np.linalg.det(cov_mat)
    prob = np.exp(-arg_for_expn) / (np.sqrt(det_cov) * ((2 * np.pi) ** (dimn / 2)))
    return prob

  dimn_of_prob = X.shape[1]

  # Initial guess for GMM parameters
  initial_means = np.random.rand(L, dimn_of_prob)
  initial_covs = [np.eye(dimn_of_prob) for _ in range(L)]
  initial_mixing_coeffs = np.random.rand(1, L)
  initial_mixing_coeffs /= np.sum(initial_mixing_coeffs)  # Normalize

  # Create a dictionary to store covariance matrices
  covariance_matrices = {i: initial_covs[i] for i in range(L)}

  # Train the GMM model using the EM algorithm
  trained_means, trained_covs, trained_mixing_coeffs = EM_training(
      mean=initial_means,
      mixing_coeffs=initial_mixing_coeffs,
      covs=covariance_matrices,
      dataset_input=X,
      num_iters=100,
      tol=0.0001
  )

  print("trained means are:"); print(trained_means)
  print("trained_covs are:");print(trained_covs)
  print("trained_mixing_coeffs are:");print(trained_mixing_coeffs)

  # Plot the original data
  plt.figure(figsize=(8, 6))
  plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)


  # Plot the GMM model
  ###############################
  x, y = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max(), 100), np.linspace(X[:, 1].min(), X[:, 1].max(), 100))
  xy = np.column_stack([x.ravel(), y.ravel()])
  pdf_values = np.zeros_like(x)
  for idx in range(L):
      #component_pdf = trained_mixing_coeffs[0, idx] * multivariate_normal.pdf(xy, trained_means[idx, :], trained_covs[idx])
      component_pdf = trained_mixing_coeffs[0, idx] * gaussian_multivar(xy, trained_means[idx, :], trained_covs[idx])
      pdf_values += component_pdf.reshape(x.shape)

  # Create a contour plot for the GMM model with a color bar
  contour = plt.contourf(x, y, pdf_values, levels=10, cmap='viridis', alpha=0.5)
  colorbar = plt.colorbar(contour, label='Probability Density')

  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.legend()
  plt.title('GMM Model and make_moons Dataset')
  plt.show()

  #plot_contour_plot_gmm(X,trained_mixing_coeffs,trained_means,trained_covs,L)



  return trained_means, trained_covs, trained_mixing_coeffs

We employed following strategy to sample from the GMM model:¶

In a Gaussian Mixture Model (GMM) the multiple Gaussian components represents a cluster or mode in your data. The "mixing coefficients" indicate the probability of choosing each component when generating data:

Step 1. Component Selection: First, one needs to select one of the Gaussian components to generate a data point. We do this by sampling from the mixing coefficients. For example, if we have two components with mixing coefficients [0.4, 0.6], we'd choose the first component with a 40% chance and the second component with a 60% chance.

Step.2. Sample from the Selected Component: Once we've selected a component, we sample a data point from that component's Gaussian distribution. This is done by generating random numbers according to the mean and covariance matrix of the selected component's Gaussian distribution.

Step 3: Repeat: We repeat these steps to generate as many data points as you need.

In [ ]:
# sample the data_points from the GMM model
def sample_from_gmm(num_samples, mixing_coeffs, mean_array, covs,X, want_scatter_plot=False):
  from scipy.stats import multivariate_normal

  samples = []
  for _ in range(num_samples):
      # Sample a component based on mixing coefficients
      component = np.random.choice(len(mixing_coeffs[0]), p=mixing_coeffs[0])

      # Sample from the selected component
      sample = np.random.multivariate_normal(mean_array[component, :], covs[component])
      samples.append(sample)
  samples=np.array(samples)

  if want_scatter_plot==True:
    plt.figure(2)
    plt.figure(figsize=(8, 6))
    plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
    plt.scatter(samples[:, 0], samples[:, 1], c='red', label='Generated Data', alpha=0.5)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.title('Generated Data from GMM Model')
    plt.show()

  return samples
Comment
  • From conceptual standpoint, both our and the sample soln's implementation for the GMM follow the same idea for (a) EM-algorithm for finding the best parameters and (b) sampling the dataset is concerned. However note that the sample solution doesnt explicitly perform the calculation of log-likelihood or the convergence check, something which we have implemented in our code as well.

However something that we should have added to our implementation is the regularisation term (the sample solution does it, they define it as epsilon) which is used when updating the covairance matrices in the M-step of the E-M algorithm. This regularisation term is added to covariance matrices to ensure that the matrix remains positive definite.

  • We could have made following improvements to make our code faster:
  1. Vectorization: We could have used NumPy functions for operations on arrays instead of explicit loops where possible. This can significantly speed up calculations.
# Instead of using explicit loops
temp_vec = np.array([gamma[idx_over_data, idx_el] * dataset_input[idx_over_data, :] for idx_over_data in range(0, num_data)])
mean[idx_el, :] = (1. / N_l[idx_el]) * np.sum(temp_vec, axis=0)

# Use NumPy functions for vectorized operations
temp_vec = gamma[:, idx_el][:, np.newaxis] * dataset_input
mean[idx_el, :] = np.sum(temp_vec, axis=0) / N_l[idx_el]
  1. Avoid list comprehension for large arrays: In our list comprehensions, we are creating lists and then summing them up. This can be memory-intensive for large datasets. Instead, we could have used the array operations directly.
# Instead of list comprehensions
temp_outer_prod_list = np.array([gamma[idx_over_data, idx_el] * (dataset_input[idx_over_data, :] - mean[idx_el, :]).reshape((dimn_of_prob, 1)) @
                                 (dataset_input[idx_over_data, :] - mean[idx_el, :]).reshape((1, dimn_of_prob)) for idx_over_data in range(0, num_data)])

# Use array operations directly
temp_diff = dataset_input - mean[idx_el, :]
temp_outer_prod_list = gamma[:, idx_el][:, np.newaxis, np.newaxis] * np.einsum('ij,ik->ijk', temp_diff, temp_diff)
covs[idx_el] = np.sum(temp_outer_prod_list, axis=0) / N_l[idx_el]
In [ ]:
list(datasets.keys())
Out[ ]:
['Small', 'Medium', 'Large']
In [ ]:
### fit the GMM model to the data and then sample

dataset_for_GMM={}; keys_dataset=list(datasets.keys())
for data_size in keys_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  print("_________________________________________________________")
  X=datasets[data_size];temp={}
  for idx_el in range(1,7):
    print("index_el:",idx_el)
    trained_means, trained_covs, trained_mixing_coeffs=fit_the_GMM(X=X, L=idx_el)

    generated_samples=sample_from_gmm(num_samples=X.shape[0],mixing_coeffs=trained_mixing_coeffs,
                                          mean_array=trained_means,
                                          covs=trained_covs,
                                      X=X, want_scatter_plot=True)
    temp[idx_el]=generated_samples
  dataset_for_GMM[data_size]=temp
----------------DATASIZE: Small ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.50006995 0.2464502 ]]
trained_covs are:
{0: array([[ 0.753472  , -0.17935397],
       [-0.17935397,  0.24466648]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.80769015  0.17751047]
 [-0.74562807  0.52561939]]
trained_covs are:
{0: array([[ 0.44948797, -0.13198018],
       [-0.13198018,  0.25881563]]), 1: array([[0.04948005, 0.06244601],
       [0.06244601, 0.09018868]])}
trained_mixing_coeffs are:
[[0.80195932 0.19804068]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 0.71459993  0.281603  ]
 [ 0.61840201  0.75452719]
 [ 0.40285576 -0.01647488]]
trained_covs are:
{0: array([[ 0.91569742, -0.24268738],
       [-0.24268738,  0.1313912 ]]), 1: array([[ 0.60645975, -0.18408576],
       [-0.18408576,  0.06919984]]), 2: array([[ 0.77493737, -0.22318156],
       [-0.22318156,  0.15328593]])}
trained_mixing_coeffs are:
[[0.10380025 0.30088703 0.59531273]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[-0.74373538  0.52724569]
 [ 0.49212539 -0.00998175]
 [ 0.78034677  0.723156  ]
 [ 1.50025431 -0.31479398]]
trained_covs are:
{0: array([[0.04925169, 0.06251597],
       [0.06251597, 0.09093562]]), 1: array([[ 0.14582385, -0.03125686],
       [-0.03125686,  0.12055593]]), 2: array([[ 0.57467376, -0.19671174],
       [-0.19671174,  0.07751427]]), 3: array([[0.08423101, 0.04730756],
       [0.04730756, 0.03251709]])}
trained_mixing_coeffs are:
[[0.2055128  0.33860229 0.27456441 0.18132049]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.69253483 -0.09092805]
 [ 0.4064456   0.57350401]
 [ 1.81651015  0.01400419]
 [-0.70462613  0.60873619]
 [-0.57022529  0.47183015]]
trained_covs are:
{0: array([[ 0.1882859 , -0.07826564],
       [-0.07826564,  0.18301614]]), 1: array([[ 0.11998602, -0.02308836],
       [-0.02308836,  0.17391539]]), 2: array([[0.02825188, 0.04299729],
       [0.04299729, 0.07831436]]), 3: array([[0.05003109, 0.05501902],
       [0.05501902, 0.06405516]]), 4: array([[0.18140737, 0.19641821],
       [0.19641821, 0.21639144]])}
trained_mixing_coeffs are:
[[0.34796375 0.24753802 0.17313393 0.17922193 0.05214237]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.73627212  0.53576089]
 [ 0.04559832  0.22281345]
 [ 0.65519663  0.6311833 ]
 [ 1.78714662 -0.01732783]
 [ 0.71546059 -0.38751407]
 [-0.10457785  0.97105925]]
trained_covs are:
{0: array([[0.05094303, 0.06435182],
       [0.06435182, 0.09268741]]), 1: array([[ 0.00268605, -0.00784069],
       [-0.00784069,  0.03525823]]), 2: array([[ 0.09458215, -0.09117792],
       [-0.09117792,  0.10940841]]), 3: array([[0.03685224, 0.0502607 ],
       [0.0502607 , 0.08314315]]), 4: array([[ 0.12809965, -0.04311216],
       [-0.04311216,  0.01943504]]), 5: array([[0.0019856 , 0.00160915],
       [0.00160915, 0.00448503]])}
trained_mixing_coeffs are:
[[0.21003668 0.09719611 0.2520742  0.18888608 0.21205136 0.03975557]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Medium ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.49985466 0.24700133]]
trained_covs are:
{0: array([[ 0.74904345, -0.19119175],
       [-0.19119175,  0.2433779 ]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[0.44917222 0.04491844]
 [0.62286977 0.73749168]]
trained_covs are:
{0: array([[ 0.81234166, -0.228497  ],
       [-0.228497  ,  0.17381113]]), 1: array([[ 0.57404025, -0.18584238],
       [-0.18584238,  0.07252806]])}
trained_mixing_coeffs are:
[[0.7082144 0.2917856]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[-0.7737834   0.51975744]
 [ 1.76511022 -0.04300051]
 [ 0.49330375  0.25437384]]
trained_covs are:
{0: array([[0.040528  , 0.05267169],
       [0.05267169, 0.08300167]]), 1: array([[0.04383146, 0.0543914 ],
       [0.0543914 , 0.08102181]]), 2: array([[ 0.18834427, -0.12138886],
       [-0.12138886,  0.29419402]])}
trained_mixing_coeffs are:
[[0.18909945 0.19354863 0.61735192]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 0.48864373 -0.18266959]
 [-0.69458279  0.58416462]
 [ 1.75212006 -0.0556088 ]
 [ 0.57128862  0.66601923]]
trained_covs are:
{0: array([[ 0.17184121, -0.11440647],
       [-0.11440647,  0.10408415]]), 1: array([[0.06983778, 0.07260714],
       [0.07260714, 0.09321453]]), 2: array([[0.04731459, 0.05721665],
       [0.05721665, 0.08278782]]), 3: array([[ 0.13389047, -0.10183705],
       [-0.10183705,  0.09826414]])}
trained_mixing_coeffs are:
[[0.30155874 0.22387285 0.20060659 0.27396182]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 0.8985737  -0.45494526]
 [ 1.00047115  0.27866819]
 [ 0.26990034  0.8767014 ]
 [-0.77334284  0.52525856]
 [ 1.00370896 -0.17079393]]
trained_covs are:
{0: array([[ 0.09583784, -0.01040648],
       [-0.01040648,  0.00459743]]), 1: array([[ 0.61772248, -0.02023903],
       [-0.02023903,  0.02871827]]), 2: array([[ 0.12956334, -0.03827076],
       [-0.03827076,  0.01906512]]), 3: array([[0.03871495, 0.05015285],
       [0.05015285, 0.0782377 ]]), 4: array([[ 0.50148919, -0.03404347],
       [-0.03404347,  0.02566867]])}
trained_mixing_coeffs are:
[[0.15765052 0.24341738 0.21396886 0.18540784 0.1995554 ]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[-0.79592265  0.49642325]
 [ 0.82699921  0.4705127 ]
 [ 0.04192508  0.95596168]
 [ 1.81433009  0.00769911]
 [ 0.9882576  -0.46320741]
 [ 0.18051161  0.02682711]]
trained_covs are:
{0: array([[0.03271249, 0.04497008],
       [0.04497008, 0.07611623]]), 1: array([[ 0.02727174, -0.03769265],
       [-0.03769265,  0.06601615]]), 2: array([[ 0.0738626 , -0.00207535],
       [-0.00207535,  0.00401852]]), 3: array([[0.02719964, 0.038613  ],
       [0.038613  , 0.06770735]]), 4: array([[ 0.08176539, -0.00181037],
       [-0.00181037,  0.00338919]]), 5: array([[ 0.02878055, -0.04141316],
       [-0.04141316,  0.07297738]])}
trained_mixing_coeffs are:
[[0.18017456 0.16739798 0.15220133 0.17229991 0.15677852 0.17114771]]
<Figure size 640x480 with 0 Axes>
----------------DATASIZE: Large ------------------
_________________________________________________________
index_el: 1
L is:
1
trained means are:
[[0.5006108  0.24937081]]
trained_covs are:
{0: array([[ 0.75409776, -0.19372119],
       [-0.19372119,  0.24760576]])}
trained_mixing_coeffs are:
[[1.]]
<Figure size 640x480 with 0 Axes>
index_el: 2
L is:
2
trained means are:
[[ 0.62829721  0.09056855]
 [-0.0507244   0.93506067]]
trained_covs are:
{0: array([[ 0.81228134, -0.13225792],
       [-0.13225792,  0.16964158]]), 1: array([[0.12849924, 0.00648573],
       [0.00648573, 0.00518643]])}
trained_mixing_coeffs are:
[[0.81195531 0.18804469]]
<Figure size 640x480 with 0 Axes>
index_el: 3
L is:
3
trained means are:
[[ 1.74385897 -0.06299622]
 [-0.76689092  0.54119564]
 [ 0.48653496  0.26148846]]
trained_covs are:
{0: array([[0.05194519, 0.06166023],
       [0.06166023, 0.09103468]]), 1: array([[0.04516723, 0.05632036],
       [0.05632036, 0.08569044]]), 2: array([[ 0.17811734, -0.10976195],
       [-0.10976195,  0.29223171]])}
trained_mixing_coeffs are:
[[0.20444224 0.19384816 0.60170959]]
<Figure size 640x480 with 0 Axes>
index_el: 4
L is:
4
trained means are:
[[ 1.81235615  0.00383034]
 [ 0.21044536  0.12828942]
 [ 0.77699853  0.37363025]
 [-0.81836973  0.48956207]]
trained_covs are:
{0: array([[0.0286931 , 0.04150261],
       [0.04150261, 0.07661638]]), 1: array([[ 0.14192445, -0.18107452],
       [-0.18107452,  0.28678854]]), 2: array([[ 0.14993125, -0.18741391],
       [-0.18741391,  0.29675392]]), 3: array([[0.02890473, 0.04178497],
       [0.04178497, 0.07497171]])}
trained_mixing_coeffs are:
[[0.17381666 0.32509748 0.33048881 0.17059706]]
<Figure size 640x480 with 0 Axes>
index_el: 5
L is:
5
trained means are:
[[ 1.81236186  0.00516478]
 [ 0.619187    0.64041602]
 [-0.76123566  0.54781842]
 [ 0.08324945  0.24193606]
 [ 0.93948828 -0.45298527]]
trained_covs are:
{0: array([[0.02838007, 0.04070096],
       [0.04070096, 0.07516323]]), 1: array([[ 0.11794533, -0.10043192],
       [-0.10043192,  0.10747158]]), 2: array([[0.04568965, 0.0567913 ],
       [0.0567913 , 0.08589983]]), 3: array([[ 0.03743876, -0.07661123],
       [-0.07661123,  0.19122726]]), 4: array([[ 0.09528433, -0.00751979],
       [-0.00751979,  0.00424314]])}
trained_mixing_coeffs are:
[[0.17411171 0.26193946 0.19831888 0.19815708 0.16747287]]
<Figure size 640x480 with 0 Axes>
index_el: 6
L is:
6
trained means are:
[[ 0.26756185 -0.07134769]
 [-0.86519315  0.42787887]
 [ 1.68143494 -0.11412851]
 [ 0.94456531 -0.05292318]
 [ 0.58870719  0.76962196]
 [-0.19904052  0.94382425]]
trained_covs are:
{0: array([[ 0.05511159, -0.06300095],
       [-0.06300095,  0.0884764 ]]), 1: array([[0.01717837, 0.02795799],
       [0.02795799, 0.06008518]]), 2: array([[0.0742036 , 0.07681313],
       [0.07681313, 0.09843615]]), 3: array([[0.0076563 , 0.00114038],
       [0.00114038, 0.14426678]]), 4: array([[ 0.04412181, -0.03133136],
       [-0.03133136,  0.02659675]]), 5: array([[0.06481334, 0.01158746],
       [0.01158746, 0.00533173]])}
trained_mixing_coeffs are:
[[0.21219401 0.14627734 0.23401158 0.13300356 0.13446874 0.14004476]]
<Figure size 640x480 with 0 Axes>
Comment

Both sample code and our code implements the similar concepts, hence we believe that they are essentially same and should generate similar results.

The sample solution considers L=20 directly for fitting the gaussian distribution hence it obviously gives a better answer than ours (we expect similar result for L=20 in our case). However in our implementaiton, my taking values of L=1,2,3,4,5,6, we have explicitly shown, both via contour plots (this is missing in sample soln) as well as through MMD_squared caln which we have carried out below (this sample solution also shows), how increasing the value of L (having more mixer terms) improves the model.

In [ ]:
# with open('GMM_dataset_dictionary.pkl', 'wb') as f:
#     pickle.dump(dataset_for_GMM, f)

Observations:

  1. For the given data-set, increasing the number of mixers in our GMM model helps in learning the distribution, something we would expect as the dataset is clearly not uni-modal so to speak.

  2. As can clearly be seen, for a particular L (number of mixer components in the GMM), more the number of dataset, better is the estimate of the probability. With larger dataset, a GMM model of low value of L (less number of mixtures) is able to model the distribution as good as a GMM model with large l trained over low number of data-points.

This can also be seen more explicitly from the plot of $mmd^{2}$ for different L for each dataset of different sizes ('small':100 data points, 'medium':500 data points, 'high':1000 datapoints) plotted below!

1.4 Kernel density estimation.¶

Note that the kernel density estimation model is a non-parametric model, hence it does not require any "training" so to speak!

  • discusses how to choose an appropriate bandwidth parameter: https://stats.libretexts.org/Bookshelves/Computing_and_Modeling/Supplemental_Modules_(Computing_and_Modeling)/Regression_Analysis/Nonparametric_Inference_-_Kernel_Density_Estimation#:~:text=Contributors-,Introduction%20and%20definition,%E2%88%88%CE%98%E2%8A%82Rd%7D.

  • Squared exponential kernel:

https://www.mathworks.com/help/stats/kernel-covariance-function-options.html

In [ ]:
def squared_exp_kernel(xa:np.array, xb:np.array,bandwidth:float,amplitude:float=1):
  '''
  xa=row vector of shape (1xdimn_of_the_prob)
  xb= row vector of shape (1X dimn_of_the_prob)
  bandwidth: a hyperparameter
  amplitude: overall variance (set to 1 in the lecture, can be changed as well)
  '''
  diff=xa-xb
  kernel=(amplitude**2) *  np.exp( -np.dot(diff,diff)/(2*bandwidth) )### what we are calling bandwidth is also refered to as sigma^2 is some of the literature. Here we have followed the notation followed in the lecture
  return kernel

def inverse_multi_quadratic_kernel(xa:np.array, xb:np.array,bandwidth:float):
  '''
  xa=row vector of shape (1xdimn_of_the_prob)
  xb= row vector of shape (1X dimn_of_the_prob)
  bandwidth: a hyperparameter
  '''
  diff=xa-xb
  kernel=bandwidth/(bandwidth+ ((norm(diff))**2) )
  return kernel

def kernel_density_estimator(x:np.array, dataset:np.array, bandwidth:float, kernel_fn):
  '''
  x: a row vector (1X dimn of the problem) at which one wants to evaluate the KDE (prob function)
  dataset= an array of shape (N X dimn of the problem)
  bandwidth: hyperparameter
  kernel_fn: choice of the user
  '''
  num_data=np.shape(dataset)[0]
  list_kernel=[kernel_fn(xa=x,xb=dataset[j,:],bandwidth=bandwidth) for j in range(0,num_data)]
  prob=(1./num_data)*sum(list_kernel)## sometimes instead of 1./num_data , we also have 1./(bandwidth X num_data)
  return prob
In [ ]:
def estimate_bandwidth_with_silverman_rule(data):
    # Step 1: Calculate the standard deviation
    std_dev = np.std(data)

    # Step 2: Calculate the interquartile range (IQR)
    Q1 = np.percentile(data, 25, axis=0)
    Q3 = np.percentile(data, 75, axis=0)
    IQR = Q3 - Q1

    # Step 3: Estimate the bandwidth using Silverman's rule of thumb
    n = len(data)
    bandwidth = 0.9 * min(std_dev, np.mean(IQR) / 1.34) * n ** (-1 / 5)

    return bandwidth
In [ ]:
# contour plot kde
def contour_plot_kde(X,bandwidth,xmin=-2,xmax=2,ymin=-1.5,ymax=1.5):

  # # Create a grid of points within the specified range
  x_grid = np.linspace(xmin, xmax, 100)
  y_grid = np.linspace(ymin, ymax, 100)
  X_grid, Y_grid = np.meshgrid(x_grid, y_grid)
  points = np.c_[X_grid.ravel(), Y_grid.ravel()]

  # Evaluate the KDE at each point in the grid
  kde_values = np.array([kernel_density_estimator(point, X, bandwidth,kernel_fn=squared_exp_kernel) for point in points])
  kde_values = kde_values.reshape(X_grid.shape)

  # Create a contour plot
  plt.figure(figsize=(8, 6))
  plt.contourf(X_grid, Y_grid, kde_values, levels=100, cmap='viridis')
  plt.scatter(X[:, 0], X[:, 1], c='blue', label='make_moons dataset', alpha=0.5)
  plt.colorbar(label='Probability Density')
  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.title('Scatter Plot for data and Contour Plot of KDE Probability Density')
  plt.legend()
  plt.show()

# contour plot for distribution from the KDE
# xmin, xmax = -1, 2  # Adjust the range as needed
# ymin, ymax = -1, 1  # Adjust the range as needed
# # bandwidth =  estimate_bandwidth_with_silverman_rule(X)
# # print("Estimated bandwidth using silverman rule:", bandwidth)
# contour_plot_kde(X,bandwidth=0.1,xmin=-1,xmax=2,ymin=-1,ymax=1)
In [ ]:
### Sampling from the KDE
def sample_from_kde(data, bandwidth, num_samples,want_scatter_plot=False):
  samples = []
  for _ in range(num_samples):
      # Randomly select a data point
      data_point = data[np.random.randint(len(data))]

      # Generate random perturbations based on the Gaussian kernel
      perturbations = np.random.normal(0, bandwidth, size=data_point.shape)

      # Add the perturbations to the selected data point
      sample = data_point + perturbations
      samples.append(sample)

  samples=np.array(samples)

  if want_scatter_plot==True:
    # Sample from the KDE
    # visualise the original and the generated data
    plt.figure(figsize=(8,6))
    plt.scatter(X[:, 0], X[:, 1,], c='b', label='Original Data')
    plt.scatter(samples[:, 0], samples[:, 1,], c='r', label='Generated Data')
    plt.legend()
    plt.tight_layout()
    plt.show()

  return samples
Comment
  1. Implementation of concept: Both sample solution and our solution have implemented the concept of the KDE correctly. However our code provides more flexibility by allowing us to choose different kernel function, hence providing the KDE to be tailored to specific characterisitcs of the data. As far as sampling is concerned, both codes implements the same strategy of "introducing the random perturbations to a randomlyselected data point from the original dataset".

  2. We can make following improvements to make our code fast:

    2.1. Vectorization: We could have utilized the NumPy vectorized operations instead of explicit loops as they are usually more efficient.

    # Instead of looping through data points
    for i in range(num_samples):
        data_point = data[np.random.randint(len(data))]
        perturbations = np.random.normal(0, bandwidth, size=data_point.shape)
        sample = data_point + perturbations
    
    # Use vectorized operations
    data_points = data[np.random.randint(len(data), size=num_samples)]
    perturbations = np.random.normal(0, bandwidth, size=(num_samples, data.shape[1]))
    samples = data_points + perturbations
    

    2.2 Precompute Random Perturbations: If the same perturbations are used for all samples, precompute them to avoid redundant computations inside the loop.

    
    # Instead of 
    for i in range(num_samples):
    data_point = data[np.random.randint(len(data))]
    perturbations = np.random.normal(0, bandwidth, size=data_point.shape)
    sample = data_point + perturbations
    
    # Use Precomputed perturbations
    perturbations = np.random.normal(0, bandwidth, size=(num_samples, data.shape[1]))
    
    # Inside the loop
    for i in range(num_samples):
        data_point = data[np.random.randint(len(data))]
        sample = data_point + perturbations[i]
    
    
  3. We believe our solutions (the cases (hyperparameter) that we considered to explore the problem) provides more insight into how the KDE behaves. We have considered both the appropriate value of bandwidth (around 0.1) as well as others. We have also explicitly shown through contour plots how the prob distn generated by the KDE behaves for different values of the bandwidth (this is not done in the sample solution).

In [ ]:
dataset_kde={};bandwidth=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
keys_dataset=list(datasets.keys())
for data_size in keys_dataset:
  print("--------data_size:",data_size)
  X=datasets[data_size];temp={}
  for bw in bandwidth:
    print("bandwidth:",bw)
    contour_plot_kde(X,bandwidth=bw,xmin=-2,xmax=2.5,ymin=-1.5,ymax=1.5)
    generated_data_KDE=sample_from_kde(data=X,bandwidth=bw,num_samples=X.shape[0],want_scatter_plot=True)
    temp[bw]=generated_data_KDE
  dataset_kde[data_size]=temp
--------data_size: Small
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Medium
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
--------data_size: Large
bandwidth: 0.01
bandwidth: 0.05
bandwidth: 0.1
bandwidth: 0.15
bandwidth: 0.2
bandwidth: 0.25
bandwidth: 0.3
bandwidth: 0.35
bandwidth: 0.4
In [ ]:
# with open('KDE_dataset_dictionary_smaller_bw_range.pkl', 'wb') as f:
#     pickle.dump(dataset_kde, f)

Implementation of MMD metric:¶

a good resource for MMD: https://www.onurtunali.com/ml/2019/03/08/maximum-mean-discrepancy-in-machine-learning.html

In [ ]:
import numpy as np

def estimate_mmd_squared(samples1, samples2, kernel_function, bandwidth=1.0):
    """
    Estimate the Maximum Mean Discrepancy (MMD) squared between two sets of samples using a kernel function.

    Args:
    - samples1: the first set of samples.
                A numpy array of shape (n_samples1, n_features) representing .
    - samples2: the second set of samples.
                A numpy array of shape (n_samples2, n_features) representing .
    - kernel_function: A function that computes the kernel value between two samples.
    - bandwidth: Bandwidth parameter for the kernel function.

    Returns:
    - mmd_squared: The estimated MMD squared.
    """
    num_samples1 = samples1.shape[0]
    num_samples2 = samples2.shape[0]

    # Compute the MMD squared statistic
    mmd_squared1,mmd_squared2,mmd_squared12 = 0.0,0.0,0.0

    for i in range(num_samples1):
        for j in range(num_samples1):
            mmd_squared1 += kernel_function(samples1[i], samples1[j], bandwidth)
    mmd_squared1 *= 1.0 / (num_samples1 * (num_samples1 - 1))

    for i in range(num_samples2):
        for j in range(num_samples2):
            mmd_squared2 += kernel_function(samples2[i], samples2[j], bandwidth)
    mmd_squared2 *= 1.0 / (num_samples2 * (num_samples2 - 1))

    for i in range(num_samples1):
        for j in range(num_samples2):
            mmd_squared12 -= 2.0 * kernel_function(samples1[i], samples2[j], bandwidth)
    mmd_squared12 *= 1.0 / (num_samples1 * (num_samples2))

    mmd_squared=mmd_squared1+mmd_squared2+mmd_squared12


    return mmd_squared

def squared_exp_kernel(xa, xb, bandwidth, amplitude=1.0):
    """
    Squared Exponential (RBF) kernel function.

    Args:
    - xa: A numpy array of shape (1, dimn_of_the_prob).
    - xb: A numpy array of shape (1, dimn_of_the_prob).
    - bandwidth: Bandwidth parameter.
    - amplitude: Overall variance (default is 1).

    Returns:
    - kernel: The kernel value.
    """
    diff = xa - xb
    kernel = (amplitude ** 2) * np.exp(-np.dot(diff, diff) / (2 * bandwidth))
    return kernel

def inverse_multi_quadratic_kernel(xa, xb, bandwidth):
    """
    Inverse Multi-Quadratic kernel function.

    Args:
    - xa: A numpy array of shape (1, dimn_of_the_prob).
    - xb: A numpy array of shape (1, dimn_of_the_prob).
    - bandwidth: Bandwidth parameter.

    Returns:
    - kernel: The kernel value.
    """
    diff = xa - xb
    kernel = bandwidth / (bandwidth + np.sum(diff ** 2))
    return kernel
In [ ]:
def plot_dict_of_dicts(data, x_label='X-values', y_label='Y-values', title='Two-Line Point Plot'):
    """
    Plot a dictionary of dictionaries as separate lines on a two-line point plot.

    Args:
        data (dict): A dictionary where keys represent labels and values are dictionaries with x and y values.
        x_label (str): Label for the x-axis.
        y_label (str): Label for the y-axis.
        title (str): Title for the plot.

    Returns:
        None
    """
    plt.figure(figsize=(8, 6))

    for label, values in data.items():
        x_values = list(values.keys())
        y_values = list(values.values())
        plt.plot(x_values, y_values,":" ,marker='o', label=label)

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

Using squared exponential kernel to estimate the MMD.¶

evaluating mmds for GMMs using squared exponential kernel¶

In [ ]:
size_dataset=["Small","Medium","Large"];mmd_gmm_model_sqd_exp_kernel={}
for data_size in size_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  X=datasets[data_size];
  temp={}
  for idx_el in range(1,7):
    dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
    temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                           samples2=X, kernel_function=squared_exp_kernel)
    print("for L=",idx_el," MMD^2 score:");print(temp[idx_el])
  mmd_gmm_model_sqd_exp_kernel[data_size]=temp
----------------DATASIZE: Small ------------------
for L= 1  MMD^2 score:
0.021462996536169454
for L= 2  MMD^2 score:
0.015153576109734779
for L= 3  MMD^2 score:
0.023083391802166675
for L= 4  MMD^2 score:
0.0157404628433937
for L= 5  MMD^2 score:
0.012968477216233731
for L= 6  MMD^2 score:
0.011881913431552427
----------------DATASIZE: Medium ------------------
for L= 1  MMD^2 score:
0.00808000119872232
for L= 2  MMD^2 score:
0.007847838415810715
for L= 3  MMD^2 score:
0.002854229711848566
for L= 4  MMD^2 score:
0.0026202490229276787
for L= 5  MMD^2 score:
0.0030813828475186877
for L= 6  MMD^2 score:
0.0022849453894315808
----------------DATASIZE: Large ------------------
for L= 1  MMD^2 score:
0.008141649846514776
for L= 2  MMD^2 score:
0.0035073972497046135
for L= 3  MMD^2 score:
0.0015671430299024625
for L= 4  MMD^2 score:
0.0019851960641910082
for L= 5  MMD^2 score:
0.0013715822944770917
for L= 6  MMD^2 score:
0.0015351823895199956
In [ ]:
plot_dict_of_dicts(data=mmd_gmm_model_sqd_exp_kernel,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for GMM for dataset of different lengths ")

evaluating mmds for KDEs using squared exponential kernel¶

In [ ]:
mmd_kde_model_sqd_exp_kernel={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
  print("--------Data_size:",data_size,"-----------")
  X=datasets[data_size];temp={}
  for bw in bandwidth_for_kde:
    dataset_sampled_kde=dataset_kde[data_size][bw]
    temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                           samples2=X, kernel_function=squared_exp_kernel)
    print("for bandwidth:",bw," MMD^2 score:",temp[bw])
  mmd_kde_model_sqd_exp_kernel[data_size]=temp
--------Data_size: Small -----------
for bandwidth: 0.01  MMD^2 score: 0.0227324569272509
for bandwidth: 0.05  MMD^2 score: 0.014870248884561788
for bandwidth: 0.1  MMD^2 score: 0.0152579907442425
for bandwidth: 0.15  MMD^2 score: 0.013652325888171557
for bandwidth: 0.2  MMD^2 score: 0.014246454194056346
for bandwidth: 0.25  MMD^2 score: 0.013828091300571055
for bandwidth: 0.3  MMD^2 score: 0.018295668526015052
for bandwidth: 0.35  MMD^2 score: 0.013389361599205007
for bandwidth: 0.4  MMD^2 score: 0.01629857467187279
--------Data_size: Medium -----------
for bandwidth: 0.01  MMD^2 score: 0.004230828587042268
for bandwidth: 0.05  MMD^2 score: 0.00229779593652113
for bandwidth: 0.1  MMD^2 score: 0.0034189463359119987
for bandwidth: 0.15  MMD^2 score: 0.0030637120826850772
for bandwidth: 0.2  MMD^2 score: 0.0037768409473978304
for bandwidth: 0.25  MMD^2 score: 0.004494465353473864
for bandwidth: 0.3  MMD^2 score: 0.004621511270823331
for bandwidth: 0.35  MMD^2 score: 0.007253995535964286
for bandwidth: 0.4  MMD^2 score: 0.00628754405726728
--------Data_size: Large -----------
for bandwidth: 0.01  MMD^2 score: 0.0016516745432757496
for bandwidth: 0.05  MMD^2 score: 0.0028152656201276827
for bandwidth: 0.1  MMD^2 score: 0.001332365306855099
for bandwidth: 0.15  MMD^2 score: 0.0015215272280293402
for bandwidth: 0.2  MMD^2 score: 0.0026557089785370636
for bandwidth: 0.25  MMD^2 score: 0.002927438603620436
for bandwidth: 0.3  MMD^2 score: 0.003200250134513638
for bandwidth: 0.35  MMD^2 score: 0.004580846148862894
for bandwidth: 0.4  MMD^2 score: 0.006475937707462465
In [ ]:
plot_dict_of_dicts(data=mmd_kde_model_sqd_exp_kernel,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD_squared (with sqd_exp kernel) plot for KDE for dataset of different lengths ")

Evaluating MMDS using inverse multi-quadratic fn¶

for GMM

In [ ]:
size_dataset=["Small","Medium","Large"];mmd_gmm_model_inv_quad={}
for data_size in size_dataset:
  print("----------------DATASIZE:",data_size,"------------------")
  X=datasets[data_size];
  temp={}
  for idx_el in range(1,7):
    print("L:",idx_el)
    dataset_sampled_gmm=dataset_for_GMM[data_size][idx_el]
    temp[idx_el]=estimate_mmd_squared(samples1=dataset_sampled_gmm,
                                           samples2=X, kernel_function=inverse_multi_quadratic_kernel)
    print("For L=",idx_el," MMD^2 score:",temp[idx_el])
  mmd_gmm_model_inv_quad[data_size]=temp
----------------DATASIZE: Small ------------------
L: 1
For L= 1  MMD^2 score: 0.02522100706992847
L: 2
For L= 2  MMD^2 score: 0.019936303817976953
L: 3
For L= 3  MMD^2 score: 0.02869786973074495
L: 4
For L= 4  MMD^2 score: 0.015868479819498127
L: 5
For L= 5  MMD^2 score: 0.01374438812674006
L: 6
For L= 6  MMD^2 score: 0.012066812679013106
----------------DATASIZE: Medium ------------------
L: 1
For L= 1  MMD^2 score: 0.013191802458515478
L: 2
For L= 2  MMD^2 score: 0.012626625454044249
L: 3
For L= 3  MMD^2 score: 0.0049995159821693
L: 4
For L= 4  MMD^2 score: 0.0032303076786441842
L: 5
For L= 5  MMD^2 score: 0.005232252075411892
L: 6
For L= 6  MMD^2 score: 0.002370310013506516
----------------DATASIZE: Large ------------------
L: 1
For L= 1  MMD^2 score: 0.01372832858219697
L: 2
For L= 2  MMD^2 score: 0.00682231846961745
L: 3
For L= 3  MMD^2 score: 0.003644204407087237
L: 4
For L= 4  MMD^2 score: 0.004022457051592765
L: 5
For L= 5  MMD^2 score: 0.0018976849608979274
L: 6
For L= 6  MMD^2 score: 0.0017011299403707492
In [ ]:
plot_dict_of_dicts(data=mmd_gmm_model_inv_quad,
                   x_label="L values",
                   y_label="MMD_squared",
                   title="MMD_squared (with inv-multi-quad kernel) plot for GMM for dataset of different lengths")

for KDE:

In [ ]:
mmd_kde_model_inv_quad={}
bandwidth_for_kde=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4];keys_dataset=list(datasets.keys())
size_dataset=["Small","Medium","Large"]
for data_size in size_dataset:
  print("--------Data_size:",data_size,"-----------")
  X=datasets[data_size];temp={}
  for bw in bandwidth_for_kde:
    dataset_sampled_kde=dataset_kde[data_size][bw]
    temp[bw]=estimate_mmd_squared(samples1=dataset_sampled_kde,
                                           samples2=X, kernel_function=inverse_multi_quadratic_kernel)
    print("for bandwidth:",bw," MMD^2 score:",temp[bw])
  mmd_kde_model_inv_quad[data_size]=temp
--------Data_size: Small -----------
for bandwidth: 0.01  MMD^2 score: 0.021025600080310625
for bandwidth: 0.05  MMD^2 score: 0.01418040320122893
for bandwidth: 0.1  MMD^2 score: 0.01425900388764445
for bandwidth: 0.15  MMD^2 score: 0.013629291631276352
for bandwidth: 0.2  MMD^2 score: 0.014214256636971845
for bandwidth: 0.25  MMD^2 score: 0.015443687141364015
for bandwidth: 0.3  MMD^2 score: 0.01853420968861652
for bandwidth: 0.35  MMD^2 score: 0.014411663277056563
for bandwidth: 0.4  MMD^2 score: 0.017518280459942748
--------Data_size: Medium -----------
for bandwidth: 0.01  MMD^2 score: 0.004301337898196311
for bandwidth: 0.05  MMD^2 score: 0.0023686525534216685
for bandwidth: 0.1  MMD^2 score: 0.003316046723267796
for bandwidth: 0.15  MMD^2 score: 0.0034443858511293834
for bandwidth: 0.2  MMD^2 score: 0.00376625869288183
for bandwidth: 0.25  MMD^2 score: 0.004871164915991488
for bandwidth: 0.3  MMD^2 score: 0.006049072647813292
for bandwidth: 0.35  MMD^2 score: 0.007820356559834263
for bandwidth: 0.4  MMD^2 score: 0.008314072709681763
--------Data_size: Large -----------
for bandwidth: 0.01  MMD^2 score: 0.0015842704000799124
for bandwidth: 0.05  MMD^2 score: 0.0024984858386801756
for bandwidth: 0.1  MMD^2 score: 0.0013548147448194658
for bandwidth: 0.15  MMD^2 score: 0.0017721989809129646
for bandwidth: 0.2  MMD^2 score: 0.0028602606360367266
for bandwidth: 0.25  MMD^2 score: 0.0034149266742496964
for bandwidth: 0.3  MMD^2 score: 0.004455007301260472
for bandwidth: 0.35  MMD^2 score: 0.005633445445235363
for bandwidth: 0.4  MMD^2 score: 0.007897413444463042
In [ ]:
plot_dict_of_dicts(data=mmd_kde_model_inv_quad,
                   x_label="bandwidth (h) for KDE",
                   y_label="MMD_squared",
                   title="MMD^{squared} (with inverse_multi_quadratic_kernel) for KDE for dataset of different lengths ")

Generic observations:¶

  1. In the case of both GMMs with small L (number of mixtures) and KDE with otherwise not good bandwidth (>0.25 in our case), having large number of data-points is favourable!

  2. We expected (and as also can be seen from the plots above) the single gaussian to act the worse (as can be seen from l=1 case in the GMM) as our dataset is quite 'multimodal' so to speak which it clearly cannot capture.

Comment
  1. We would like to point out that in our code, we missed providing the calculation for $mmd^{2}$ in the histogram and single Gaussian fit.

  2. We have calculated the $mmd^{2}$ caln (with different kernels) for both GMM and KDE.

  3. Although our results for GMM model are similar (in trend), we are not quite sure about the results for the KDE provided in the sample solution. What we observed was that for the KDE, bandwidth around 0.1 was the suitable and as we increased (and decreased) it, the performance of the KDE worsened. This is something we also expect as increasing the bandwidth results in oversmoothing, which gets clearly reflected in our plots, Whereas what the sample solution shows that increasing the bandwidth after 0.1 results in lowering of the value of MMD_squared, something we dont quite expect.

Comment

General Commnent for the task 1: Except for the histogram, the remaining three models were implemented and trained in the manner mostly similar to the sample solution. The sample solution is coded in a much more formal, Object-oriented format, whereas we adopted the modular approach for the sake of flexibility. Thouguh there were some improvements which we have provided above in the comments that would make our implementation more fast and robust.

2. Higher-dimensional data¶

Loading and splitting the data¶

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

Let's load the dataset and split in train and test set and analyze how well the digits are distributed:

In [ ]:
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2)

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_train)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

We see, that the digits are in evenly distributed frequency in the train dataset

Comment
same data loading as sample solution with only difference in test set size

Define Maximum Mean Discrepancy¶

In [ ]:
def MMD(x, y, kernel):
    """Emprical maximum mean discrepancy. The lower the result
       the more evidence that distributions are the same.

    Args:
        x: first sample, distribution P
        y: second sample, distribution Q
        kernel: kernel type such as "rbf" or "inverse-multi-quadratic"
    """
    # Compute squared Euclidean distances

    xx, yy, zz = np.dot(x, x.T), np.dot(y, y.T), np.dot(x, y.T)
    rx = np.diag(xx).reshape(1, -1).repeat(xx.shape[0], axis=0)
    ry = np.diag(yy).reshape(1, -1).repeat(yy.shape[0], axis=0)

    dxx = rx.T + rx - 2. * xx  # Used for A in (1)
    dyy = ry.T + ry - 2. * yy  # Used for B in (1)
    dxy = rx.T + ry - 2. * zz  # Used for C in (1)

    XX, YY, XY = (np.zeros_like(xx),
                  np.zeros_like(xx),
                  np.zeros_like(xx))

    # Squared Exponential or Gaussian Kernel
    if kernel == "rbf":
        #bandwidth_range = [10, 15, 20, 50,100]
        bandwidth_range = [10, 15, 20, 25, 30]
        XX = sum(np.exp(-0.5 * dxx / a) for a in bandwidth_range)
        YY = sum(np.exp(-0.5 * dyy / a) for a in bandwidth_range)
        XY = sum(np.exp(-0.5 * dxy / a) for a in bandwidth_range)

    # Inverse Multi-Quadratic Kernel
    if kernel == "inverse-multi-quadratic":
        #bandwidth_range = [10, 15, 20, 50,100]
        bandwidth_range = [10, 15, 20, 25, 30]
        XX = sum(1 / (dxx / a + 1) for a in bandwidth_range)
        YY = sum(1 / (dyy / a + 1) for a in bandwidth_range)
        XY = sum(1 / (dxy / a + 1) for a in bandwidth_range)

    return np.mean(XX + YY - 2. * XY)
Comment

Readability and Modularity: The sample solution separates the kernel functions (squared_exponential_kernel and inverse_multi_quadratic_kernel) from the main MMD calculation function (mmd2). This makes the code more modular and potentially easier to read and maintain.

Performance: Our implementation (MMD function) uses vectorized operations and takes advantage of NumPy's capabilities for efficient array computations. This could lead to better performance, especially for large datasets, compared to the nested list comprehensions in the sample solution.

Density Forest¶

As a first generative network we try to implement the Density Forest implemtation from https://github.com/kfritsch/density_forest

In [ ]:
from DensityForest import DensityForest
X = X_train
density_forest = DensityForest(n_estimators=20)
density_forest.fit(X)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
c:\Users\luke\OneDrive\Dokumente\UniHeidelberg\Master\Semester3\Generative Neural Networks\code\Assignment_1-Feedback\generative-baselines-commented.ipynb Cell 63 line 1
----> <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=0'>1</a> from DensityForest import DensityForest
      <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=1'>2</a> X = X_train
      <a href='vscode-notebook-cell:/c%3A/Users/luke/OneDrive/Dokumente/UniHeidelberg/Master/Semester3/Generative%20Neural%20Networks/code/Assignment_1-Feedback/generative-baselines-commented.ipynb#Y114sZmlsZQ%3D%3D?line=2'>3</a> density_forest = DensityForest(n_estimators=20)

ModuleNotFoundError: No module named 'DensityForest'

After a few hours of bug fixing in the code, since it had semantic and syntax errors. We are now at a point, were the networks starts training, but we still run into an additional Error in the Node implementation. Since fixing the semantic error of the Node class and potential further errors and a seperate implementation of the sampling method would be over the scope of the first assignment we decided to skip the density forest implementation and will continue with the single gaussian.

Comment
The Sample Solution uses KD-Trees instead of the density Forest implementation where the link was given.

Single Gaussian¶

Now we implement a single Gaussian model, by using the 'GaussianMixture' model from scikit-learn with just one component

In [ ]:
n_components = 1
single_gaussian = GaussianMixture(n_components=n_components)
single_gaussian.fit(X_train)

new_samples, _ = single_gaussian.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

new_samples, _ = single_gaussian.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.027877016283483855
MMD with inverse multi quadratic kernel: 0.03021700764199574

We can see that the single Gaussian is already capable of generating number like images, with some samples indicate legible numbers with higher noise. However, it clearly still lacks the capability to generate decent quality images in the majority of samples. Next, we will analyze the quality of generated dependend on the dataset size

In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_gaussian = []
inv_gaussian = []
var_data_gaussian = GaussianMixture(n_components=1)

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples, _ = var_data_gaussian.sample(len(X))
    rbf_gaussian.append(MMD(X,new_samples, "rbf"))
    inv_gaussian.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gaussian, label="RBF MMD")
plt.plot(dataset_sizes, inv_gaussian, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(

We see, the model improves the more training data it uses

Comment
Single Gaussian apparently did not needed to be implemented

Gaussian mixture model (GMM)¶

Let's continue with training a Gaussian mixture model (GMM) and start by using 50 gaussian components

In [ ]:
n_components = 55
gmm = GaussianMixture(n_components=n_components)
gmm.fit(X_train)
new_samples, _ = gmm.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

# plot generated images
new_samples, _ = gmm.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.0278568518776322
MMD with inverse multi quadratic kernel: 0.028092643664431385

Now we investigate the hyperparameter number of components (n_components) in an grid search attempt to find a good fit

In [ ]:
n_components = list(range(5, 101, 5))
rbf_mmd = []
inv_mmd = []
for _, n in enumerate(n_components):
    n_gmm = GaussianMixture(n_components=n)
    n_gmm.fit(X_train)
    new_samples, _ = n_gmm.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(n_components, rbf_mmd, label="RBF MMD")
plt.plot(n_components, inv_mmd, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. n_components")
plt.xlabel("n_components")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

plt.show()

Furthe we test the influence of the dataset size with the optimal n_component size

In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_gmm = []
inv_gmm = []
var_data_gmm = GaussianMixture(n_components=55)

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gmm.fit(X)
    new_samples, _ = var_data_gmm.sample(len(X))
    rbf_gmm.append(MMD(X,new_samples, "rbf"))
    inv_gmm.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_gmm, label="RBF MMD")
plt.plot(dataset_sizes, inv_gmm, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
c:\Users\luke\anaconda3\envs\ml_homework\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(

Also here the model improves with dataset size

Comment
Apperently the dataset size did not need to be analyzed. Implementation is correct und resembles sample solution. Only MMD values seem to be different and potentially incorrect. Sample solution misses the testing of different number of components

Kernel Density Estimator (KDE)¶

Now we implement a kernel density estimator (KDE) with squared exponential kernel

In [ ]:
from sklearn.neighbors import KernelDensity
bandwidth = 1.1
kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
kde.fit(X_train)
new_samples = kde.sample(len(X_test))
result_rbf = MMD(X_test,new_samples, "rbf")
result_inv = MMD(X_test,new_samples, "inverse-multi-quadratic")

print(f"MMD with squared exponential kernel: {result_rbf.item()}")
print(f"MMD with inverse multi quadratic kernel: {result_inv.item()}")

new_samples = kde.sample(10)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(new_samples[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Sample {i}")
plt.show()
MMD with squared exponential kernel: 0.02797107095274239
MMD with inverse multi quadratic kernel: 0.02836541329836265
In [ ]:
bandwidths = np.arange(0.1, 2.6, 0.1)
rbf_mmd = []
inv_mmd = []
for _, bandwidth in enumerate(bandwidths):
    n_kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    n_kde.fit(X_train)
    new_samples = n_kde.sample(len(X_test))
    rbf_mmd.append(MMD(X_test,new_samples, "rbf"))
    inv_mmd.append(MMD(X_test,new_samples, "inverse-multi-quadratic"))

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(bandwidths, rbf_mmd, label="RBF MMD")
plt.plot(bandwidths, inv_mmd, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. bandwidth")
plt.xlabel("bandwith")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)

plt.show()
In [ ]:
# Create a list of dataset sizes you want to use
dataset_sizes = [0.1, 0.3, 0.5, 0.8, 1.0]  # Fraction of the original dataset size
rbf_kde = []
inv_kde = []
var_data_gaussian = KernelDensity(bandwidth=1.1, kernel='gaussian')

for size in dataset_sizes:
    # Split the dataset into a training sets with the desired size
    if size == 1.0:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=None)
    else:
        X, _, y, _ = train_test_split(X_train, y_train, test_size=1 - size)

    # Train the classifier on the current dataset size
    var_data_gaussian.fit(X)
    new_samples = var_data_gaussian.sample(len(X))
    rbf_kde.append(MMD(X,new_samples, "rbf"))
    inv_kde.append(MMD(X,new_samples, "inverse-multi-quadratic"))
    # Create the plot


plt.figure(figsize=(10, 6))
plt.plot(dataset_sizes, rbf_kde, label="RBF MMD")
plt.plot(dataset_sizes, inv_kde, label="Inv. Multi-Quadratic MMD")

plt.title("MMD vs. Dataset Size")
plt.xlabel("dataset size")
plt.ylabel("MMD")
plt.legend()
plt.grid(True)
Comment
Apperently the dataset size did not need to be analyzed. Implementation is correct und resembles sample solution. Only MMD values seem to be different and potentially incorrect. Sample solution misses analysis of bandwidths

Random Forest Classifier¶

Finally we implement a Random forest classifier on the original dataset to destinguish the 10 digits. Further we will use this classifier to check if the models are working reasonably and that the 10 digits are generated in equal proportion

In [ ]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, y_train)
Out[ ]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [ ]:
from sklearn.metrics import accuracy_score

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test data: {accuracy:.2f}")
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(X_test[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"True: {y_test[i]} Pred: {y_pred[i]}")
plt.show()

y_pred = rf_classifier.predict(X_train)
# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers in the Train Dataset")
plt.show()
Accuracy on the test data: 0.99

We can see, the Classifier does a really good job on classifying the test dataset. Also the frequency of each predicted class in the training dataset is very evanly spaced out. So now, lets test the classifier on generated data samples from each model!

In [ ]:
sample_size = 500
samples_gauss, _ = single_gaussian.sample(sample_size)
samples_gmm, _ = gmm.sample(sample_size)
samples_kde = kde.sample(sample_size)


y_pred_gauss = rf_classifier.predict(samples_gauss)

fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gauss[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gauss[i]}")
fig.suptitle("Predictions for Single Gaussian Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_gauss)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

y_pred_gmm = rf_classifier.predict(samples_gmm)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_gmm[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_gmm[i]}")
fig.suptitle("Predictions for GMM Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_gmm)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

y_pred_kde = rf_classifier.predict(samples_kde)
fig, axes = plt.subplots(2, 5, figsize=(10, 5))
for i, ax in enumerate(axes.ravel()):
    ax.imshow(samples_kde[i].reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title(f"Pred: {y_pred_kde[i]}")
fig.suptitle("Predictions for GMM Samples", fontsize=16)
plt.show()

# Count the frequency of each number from 0 to 9 in y_pred_gauss
count = np.bincount(y_pred_kde)

# Create a bar chart to visualize the frequencies
plt.bar(range(10), count)
plt.xticks(range(10))
plt.xlabel("Predicted Numbers")
plt.ylabel("Frequency")
plt.title("Frequency of Predicted Numbers")
plt.show()

From the plot we can see, that the single Gaussian performs the worst on the training dataset since a disproportinal amount of the generated images get classified as the digit 8. The GMM and the KDE both have substentially better results, where the generated images are more evanly classified throughout the classes, suggesting better generated data samples

Comment
Random Tree Classifier implementation is exactly like the sample solution. Analysis of KDE and GMM are also the same. Only KD Tree analysis is missing. Single Gaussian was unnecassarry

Comment
¶

General comment for task 2: The task was similary solved with similar performance only missing the Density Tree implementation. The sample soultion is more readable and significantly reduces code length since using for loops for all models. However the sample solution is able to achieve this, because it does not implement the testing of different dataset sizes and bandwidth and n_component sizes